This notebook trains a recommendation system based on student ratings of questions. The predicted ratings of the recommender system will be stored in a MySQL table for the linear regression and clustering models to use.
# Notebook-wide imports: data wrangling, plotting, and model-evaluation helpers.
from matplotlib import projections  # NOTE(review): not used anywhere below — likely a stray auto-import; confirm before removing
from collections import defaultdict
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import os
import sys
# Make the notebook's parent directory importable so the local `sqltodf`
# package (the database access layer) resolves.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
import sqltodf.sqltodf as sqltodf
To get the data from the database, use IRS's database app service to fetch the recommender data from the recommenderData caching table.
# Pull the cached recommender rows (question_id, score, duration, rating, user_id).
# The generous limit effectively fetches the whole table (~183k rows here).
df = sqltodf.getDatabaseAppService().getRecommenderData(limit=10000000)
df
| question_id | score | duration | rating | user_id | |
|---|---|---|---|---|---|
| 0 | 1 | 100 | 22 | 5 | 2685 |
| 1 | 4 | 100 | 59 | 5 | 2592 |
| 2 | 4 | 100 | 18 | 1 | 2600 |
| 3 | 4 | 100 | 66 | 3 | 2609 |
| 4 | 4 | 100 | 49 | 2 | 2613 |
| ... | ... | ... | ... | ... | ... |
| 183373 | 3609 | 0 | 90 | 4 | 2289 |
| 183374 | 3609 | 0 | 119 | 4 | 2298 |
| 183375 | 3609 | 0 | 29 | 4 | 2298 |
| 183376 | 3609 | 100 | 203 | 1 | 1738 |
| 183377 | 3609 | 100 | 90 | 5 | 2040 |
183378 rows × 5 columns
Visualize the raw data from the MySQL server to find potential clusters of users.
def _mean_ratings(source_df, question_id):
    """Return each user's mean rating for one question.

    Produces a one-column frame indexed by user_id whose column is named
    '<question_id>_ratings', matching the labels the scatter plot expects.
    """
    ratings = source_df[source_df['question_id'] == question_id]
    ratings = ratings[['rating', 'user_id']].groupby('user_id').mean()
    return ratings.rename(columns={"rating": f"{question_id}_ratings"})

# Mean per-user ratings for three hand-picked questions. The inner merges
# keep only users who rated all three, giving one 3-D point per user.
question_3166_df = _mean_ratings(df, 3166)
question_2181_df = _mean_ratings(df, 2181)
question_344_df = _mean_ratings(df, 344)
graph_df = question_3166_df.merge(question_2181_df, on="user_id").merge(question_344_df, on="user_id")
graph_df
| 3166_ratings | 2181_ratings | 344_ratings | |
|---|---|---|---|
| user_id | |||
| 1315 | 1.0 | 1.0 | 1.0 |
| 1502 | 2.0 | 1.0 | 1.0 |
| 1510 | 1.0 | 1.0 | 2.0 |
| 1544 | 3.0 | 4.0 | 3.0 |
| 1558 | 1.0 | 1.0 | 1.0 |
| ... | ... | ... | ... |
| 5194 | 2.0 | 1.0 | 3.0 |
| 5313 | 1.0 | 1.0 | 1.0 |
| 5317 | 3.0 | 4.0 | 3.0 |
| 5320 | 1.0 | 2.0 | 2.0 |
| 5536 | 1.0 | 1.0 | 1.0 |
411 rows × 3 columns
# 3-D scatter of the three questions' mean ratings — one point per user,
# coloured by user_id so potential user clusters stand out visually.
axis_titles = {
    '3166_ratings': 'Question 3166 Rating',
    '2181_ratings': 'Question 2181 Rating',
    '344_ratings': 'Question 344 Rating',
}
fig = px.scatter_3d(
    graph_df,
    x='3166_ratings',
    y='2181_ratings',
    z='344_ratings',
    labels=axis_titles,
    color=graph_df.index,
    width=800,
    height=800,
)
# The user_id colour bar is meaningless as a scale — hide it.
fig.update_coloraxes(showscale=False)
fig.show()
Use Surprise's data reader to create dataset with the returned data frame of users, questions, and ratings.
# Surprise requires a Reader declaring the rating scale before it can build
# a Dataset from a (user, item, rating) data frame.
from surprise import Dataset, Reader

rating_reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[["user_id", "question_id", "rating"]], rating_reader)
Split the dataset to a training and testing set.
# Surprise ships its own train_test_split for its Dataset objects.
from surprise import accuracy
from surprise.model_selection import train_test_split  # NOTE(review): shadows sklearn's train_test_split imported at the top of the file
# Hold out 25% of the ratings for evaluation.
trainset, testset = train_test_split(data, test_size=.25)
Train the recommender system with the K-Nearest Neighbors algorithm. Test the system with the test set and print out the root mean squared error.
# KNNWithMeans: user-based collaborative filtering on mean-centred ratings.
# (SVD, NormalPredictor and SVDpp are kept available for ad-hoc comparison runs.)
# Fixed: KNNWithMeans was imported twice on the original line.
from surprise import SVD, KNNWithMeans, NormalPredictor, SVDpp

sim_options = {
    "name": "msd",       # mean squared difference similarity
    "min_support": 5,    # require at least 5 common ratings before trusting a similarity
    "user_based": True,  # similarities between users, not between questions
}
algo = KNNWithMeans(sim_options=sim_options)
algo.fit(trainset)
predictions = algo.test(testset)
# accuracy.rmse() already prints "RMSE: ..." when verbose (the default);
# wrapping it in print() reported the value twice (see the cell output).
accuracy.rmse(predictions)
Computing the msd similarity matrix... Done computing similarity matrix. RMSE: 0.8766 0.8766088654393576
Predict how user_3889 would rate question 23
# Sanity check: the model's estimated rating of question 23 for user 3889.
algo.predict(3889, 23).est
2.791860862483913
Create a rating heatmap of the model.
def createRandomMatrixGraph(df, samples=25):
    """Render a heatmap of predicted ratings for random users x questions.

    Picks `samples` distinct users and `samples` distinct questions, asks the
    trained recommender (module-level `algo`) for an estimated rating of every
    pairing, and returns a plotly imshow figure of the resulting matrix.
    """
    # Sample over the *unique* ids — sampling the raw columns could pick the
    # same user/question twice (the frame holds many rows per user), which
    # produced duplicate heatmap rows and columns.
    sample_users = df['user_id'].drop_duplicates().sample(n=samples).astype(str)
    sample_questions = df['question_id'].drop_duplicates().sample(n=samples).astype(str)
    heat_map = pd.DataFrame(columns=sample_users, index=sample_questions)
    for user in sample_users:
        for question in sample_questions:
            # ids were stringified for axis labels; predict with the raw ints.
            heat_map.loc[question, user] = algo.predict(int(user), int(question)).est
    return px.imshow(heat_map)
createRandomMatrixGraph(df, samples=25).show()
Get questions unrated from users from the database.
# Fetch all question attempts, including those with no rating yet
# (rating == NaN in the returned frame — see the cell output below).
new_df = sqltodf.getDatabaseAppService().getUnlabeledRecommenderData(limit=10000000)
new_df
| question_id | score | duration | rating | user_id | |
|---|---|---|---|---|---|
| 0 | 1 | 50 | 27 | NaN | 4745 |
| 1 | 1 | 100 | 852 | NaN | 3889 |
| 2 | 1 | 0 | 4 | NaN | 3537 |
| 3 | 1 | 50 | 63 | NaN | 3153 |
| 4 | 1 | 100 | 22 | 5.0 | 2685 |
| ... | ... | ... | ... | ... | ... |
| 1082908 | 3609 | 100 | 137 | NaN | 4388 |
| 1082909 | 3609 | 100 | 51 | NaN | 4393 |
| 1082910 | 3609 | 100 | 186 | NaN | 4394 |
| 1082911 | 3609 | 0 | 103 | 5.0 | 4395 |
| 1082912 | 3609 | 0 | 15 | NaN | 4396 |
1082913 rows × 5 columns
For IRS purposes, use the recommender system to predict ratings and place predicted ratings in a new caching table.
def apply_prediction(row):
    """Fill `predicted_rating` for one attempt row.

    Keeps the real rating when the student supplied one; otherwise asks the
    trained recommender (module-level `algo`) for an estimate.
    """
    # pd.isna replaces the original's obscure NaN self-inequality test
    # (`row['rating'] != row['rating']`), which is only true for NaN.
    if pd.isna(row['rating']):
        row['predicted_rating'] = algo.predict(row['user_id'], row['question_id']).est
    else:
        row['predicted_rating'] = row['rating']
    return row

# Row-wise apply is slow on ~1M rows, but keeps the per-row predict call simple.
new_df = new_df.apply(apply_prediction, axis=1)
new_df.to_csv("predicted_ratings.csv")  # local backup of the predictions
sqltodf.getDatabaseAppService().createPredictedRatingsTable(masterDataFrame=new_df)
Table predictedRatings created successfully.
C:\Users\richa\anaconda3\envs\its\lib\site-packages\pandas\io\sql.py:1685: UserWarning: The provided table name 'predictedRatings' is not found exactly as such in the database after writing the table, possibly due to case sensitivity issues. Consider using lower case table names. warnings.warn(msg, UserWarning)